Comparisons are made with the text features ignored. The features used here are therefore:
import json
import pickle
from pathlib import Path
# Standard plotly imports
import plotly.graph_objs as go
import plotly.plotly as py  # NOTE(review): unused below; plotly.plotly was removed in plotly 4 — confirm this import is still needed
from plotly.offline import iplot
# Data science imports
import pandas as pd
import numpy as np
# Cufflinks wrapper on plotly
import cufflinks
cufflinks.go_offline()
# Set the global theme for cufflinks
cufflinks.set_config_file(world_readable=True, theme='solar', offline=True)
cur_dir = !pwd  # IPython shell magic — only works inside a notebook/IPython session
project_dir = Path(cur_dir[0]).resolve().parents[0]  # repository root (parent of the notebook's directory)
features_path = project_dir / 'src' / 'features'
predictions_path = project_dir / 'reports'
# Training target vector (pickled numpy array; rows are unpacked as [y] below)
Y_train = pickle.load(open(features_path / 'training_y.pkl', 'rb'))
# List of per-model prediction records (each has at least 'name' and 'y_predicted')
predictions_train = json.load(open(predictions_path / 'predictions_train.json'))
predictions_train contains results for the following models:
# Print an indexed list of the models with training predictions.
for idx, prediction in enumerate(predictions_train):
    print('%d. %s' % (idx, prediction['name']))
The hyperparameters for the GD models above are:
predictions_train[4]['hparams']
ClosedForm_no_text_train and GradientDescent_no_text_train:
cf_train_no_text = predictions_train[3]
# Overlay closed-form and gradient-descent predictions (no text features)
# on the training targets.
gd_train_no_text = predictions_train[7]
cf_prediction = cf_train_no_text['y_predicted']
gd_prediction = gd_train_no_text['y_predicted']

example_numbers = list(range(1, 10001))
target_trace = go.Scatter(
    x=example_numbers,
    y=[row[0] for row in Y_train.tolist()],
    name='Target',
    line={'color': '#F9A746', 'width': 1},
    connectgaps=True,
)
cf_trace = go.Scatter(
    x=example_numbers,
    y=cf_prediction,
    name='Closed form',
    line={'color': '#E85285', 'width': 1},
    connectgaps=True,
)
gd_trace = go.Scatter(
    x=example_numbers,
    y=gd_prediction,
    name='Gradient descent',
    line={'color': '#6A1B9A', 'width': 1},
    connectgaps=True,
)
data = [target_trace, cf_trace, gd_trace]
layout = {
    'title': 'Popularity Prediction on Training Data',
    'xaxis': {'title': 'Example no.'},
    'yaxis': {'title': 'Popularity score'},
    'plot_bgcolor': '#151516',
}
fig = {'data': data, 'layout': layout}
iplot(fig)
Observations:
# Per-example squared error of each model on the training set.
cf_error = (np.array(cf_prediction).reshape(-1, 1) - Y_train) ** 2
gd_error = (np.array(gd_prediction).reshape(-1, 1) - Y_train) ** 2

example_numbers = list(range(1, 10001))
cf_error_trace = go.Scatter(
    x=example_numbers,
    y=[row[0] for row in cf_error.tolist()],
    name='Closed form',
    line={'color': '#E85285', 'width': 0.5},
    connectgaps=True,
)
gd_error_trace = go.Scatter(
    x=example_numbers,
    y=[row[0] for row in gd_error.tolist()],
    name='Gradient descent',
    line={'color': '#6A1B9A', 'width': 0.5},
    connectgaps=True,
)
data = [cf_error_trace, gd_error_trace]
layout = {
    'title': 'Squared Error for Each Training Example',
    'xaxis': {'title': 'Example no.'},
    'yaxis': {'title': 'Squared error'},
}
fig = {'data': data, 'layout': layout}
iplot(fig)
Not much to say about this chart that hasn't been said. It was plotly practice, I guess.
# Load validation-set predictions, list the models, and pick out the
# no-text closed-form and gradient-descent results.
predictions_validate = json.load(open(predictions_path / 'predictions_validate.json'))
for idx, prediction in enumerate(predictions_validate):
    print('%d. %s' % (idx, prediction['name']))
cf_validate_no_text = predictions_validate[3]
gd_validate_no_text = predictions_validate[7]
cf_prediction = cf_validate_no_text['y_predicted']
gd_prediction = gd_validate_no_text['y_predicted']
# Overlay closed-form and gradient-descent predictions on the VALIDATION targets.
# BUG FIX: the original plotted Y_train (the 10000 training targets) as the
# 'Target' trace on this 1000-example validation chart; load and plot the
# validation targets instead.
Y_validate = pickle.load(open(features_path / 'validation_y.pkl', 'rb'))
trace1 = go.Scatter(
    x=[i for i in range(1, 1001)],
    y=[y for [y] in Y_validate.tolist()],
    name='Target',
    line=dict(
        color='#F9A746',
        width=1),
    connectgaps=True
)
trace2 = go.Scatter(
    x=[i for i in range(1, 1001)],
    y=cf_prediction,
    name='Closed form',
    line=dict(
        color='#E85285',
        width=1),
    connectgaps=True
)
trace3 = go.Scatter(
    x=[i for i in range(1, 1001)],
    y=gd_prediction,
    name='Gradient descent',
    line=dict(
        color='#6A1B9A',
        width=1),
    connectgaps=True
)
data = [trace1, trace2, trace3]
layout = dict(title='Popularity Prediction on Validation Data',
              xaxis=dict(title='Example no.'),
              yaxis=dict(title='Popularity score'),
              plot_bgcolor='#151516')
fig = dict(data=data, layout=layout)
iplot(fig)
Many popularity peaks are not captured by the model, which reinforces the underfitting hypothesis
# Summary table of MSE, hyperparameters and iteration counts for the
# no-text models on training and validation sets.
summary_table = go.Table(
    header=dict(values=['<b>Set</b>',
                        '<b>Model</b>',
                        '<b>MSE</b>',
                        '<b>Hyperparameters</b>',
                        '<b>Iterations</b>']),
    cells=dict(values=[
        ['Training', 'Training', 'Validation', 'Validation'],
        ['Closed Form', 'Gradient Descent', 'Closed Form', 'Gradient Descent'],
        [cf_train_no_text['mse'],
         gd_train_no_text['mse'],
         cf_validate_no_text['mse'],
         gd_validate_no_text['mse']],
        ['', str(gd_train_no_text['hparams']),
         '', str(gd_validate_no_text['hparams'])],
        ['', gd_train_no_text['num_iterations'],
         '', gd_validate_no_text['num_iterations']],
    ]))
layout = dict(title='Predictions Without Text Features')
data = [summary_table]
iplot(dict(data=data, layout=layout))
The summary table seems to suggest there is no underfitting, as the models performed better on the validation set than on the training set. However, it is possible that this is an artifact of random sampling and noise.
The CF's weight vector is computed from a fixed formula - it will therefore give the same prediction for the same inputs.
On the other hand, GD's prediction depends on the hyperparameters set at training stage - different GD models will thus yield different predictions even for the same inputs.
This means GD is more unstable than CF.
Comparisons will be made using the closed form model.
# Closed-form MSE across the three feature sets: no text, top 60, top 160.
cf_train_160 = predictions_train[1]
cf_train_60 = predictions_train[2]
cf_validate_160 = predictions_validate[1]
cf_validate_60 = predictions_validate[2]
mse_table = go.Table(
    header=dict(values=['',
                        '<b>No Text</b>',
                        '<b>Top 60</b>',
                        '<b>Top 160</b>']),
    cells=dict(values=[
        ['<b>Training</b>', '<b>Validation</b>'],
        [cf_train_no_text['mse'], cf_validate_no_text['mse']],
        [cf_train_60['mse'], cf_validate_60['mse']],
        [cf_train_160['mse'], cf_validate_160['mse']],
    ]))
layout = dict(title='Closed Form MSE')
data = [mse_table]
iplot(dict(data=data, layout=layout))
# Closed-form predictions on the validation set for the three feature sets,
# overlaid on the validation targets.
# BUG FIX: the original plotted Y_train as the target; this chart shows the
# 1000-example validation set, so load and plot Y_validate instead.
Y_validate = pickle.load(open(features_path / 'validation_y.pkl', 'rb'))
cf_y_no_text = cf_validate_no_text['y_predicted']
cf_y_60 = cf_validate_60['y_predicted']
cf_y_160 = cf_validate_160['y_predicted']
trace1 = go.Scatter(
    x=[i for i in range(1, 1001)],
    y=[y for [y] in Y_validate.tolist()],
    name='(white) Target',
    line=dict(
        color='white',
        width=1),
    connectgaps=True
)
trace2 = go.Scatter(
    x=[i for i in range(1, 1001)],
    y=cf_y_no_text,
    name='No Text',
    line=dict(
        color='#F9A746',
        width=1),
    connectgaps=True
)
trace3 = go.Scatter(
    x=[i for i in range(1, 1001)],
    y=cf_y_60,
    name='Top 60',
    line=dict(
        color='#E85285',
        width=1),
    connectgaps=True
)
trace4 = go.Scatter(
    x=[i for i in range(1, 1001)],
    y=cf_y_160,
    name='Top 160',
    line=dict(
        color='#6A1B9A',
        width=1),
    connectgaps=True
)
data = [trace1, trace2, trace3, trace4]
layout = dict(title='Closed Form Predictions on Validation Data',
              xaxis=dict(title='Example no.'),
              yaxis=dict(title='Popularity score'),
              plot_bgcolor='#151516')
fig = dict(data=data, layout=layout)
iplot(fig)
Judging from the MSEs, top-60 is an incremental improvement over no text, and top-160 is an incremental improvement over top-60.
Even though top-160's validation MSE is the only one higher than its training's, it's a small difference (1.0477763217987115 and 1.0686390774956736).
The fact that top-160's validation MSE is lower than no text's shows that top-160 gives the best predictions out of the three.
Adding the top 160 words features overall improves our model.
We'll vary the complexity of the model to find the best performer.
Specifically, we'll vary the size of the 'stem' vector in the features from 160 to 1.
Ideally, the validation MSE will hit a minimum at a certain degree of freedom, and this is the degree of freedom we will use in our final model.
# Load the ClosedForm class directly from src/models/models.py.
# (Workaround: 'from src.models import ClosedForm' fails from the notebook.)
src_path = project_dir / 'src'
import importlib.util
models_spec = importlib.util.spec_from_file_location(
    "src.models", str(src_path / 'models' / 'models.py'))
models_module = importlib.util.module_from_spec(models_spec)
models_spec.loader.exec_module(models_module)
ClosedForm = models_module.ClosedForm
# Feature matrices: one row per example; see the layout note below.
X_train = pickle.load(open(features_path / 'training_X.pkl', 'rb'))
X_validate = pickle.load(open(features_path / 'validation_X.pkl', 'rb'))
"""
Structure of a row of X_train:
index 0: is_root
1: controversiality
2: children
3: length
4-163: x_counts
164-323: stem
324: bias term
"""
stem_start = 164  # column index where the 160-dim stem vector begins in a feature row


def _with_stem_size(X, stem_size):
    """Return a copy of feature matrix X that keeps the leading features,
    only the first `stem_size` stem features, and a trailing bias term."""
    return np.array([
        np.concatenate((
            X[j][:stem_start],                        # is_root .. x_counts
            X[j][stem_start:stem_start + stem_size],  # truncated stem vector
            np.array([1]),                            # bias term
        ))
        for j in range(X.shape[0])
    ])


# One matrix per stem vector size, in ascending order from 1 to 160.
# (The original built these descending and reversed; building ascending
# directly removes the duplicated comprehension and the reverse step.)
X_train_list = [_with_stem_size(X_train, size) for size in range(1, 161)]
X_validate_list = [_with_stem_size(X_validate, size) for size in range(1, 161)]
# Validation target vector.
Y_validate = pickle.load(open(features_path / 'validation_y.pkl', 'rb'))
# The comparison is expensive to compute, so it is cached on disk.
# If the cache file isn't there, recreate it.
# WARNING: THIS TAKES SO LONG
comparison_file = predictions_path / 'complexity_comparison.json'
if comparison_file.is_file():
    with open(comparison_file) as f:
        complexity_comparisons = json.load(f)
else:
    complexity_comparisons = []
    # BUG FIX: the original loop variables were named X_train / X_validate,
    # clobbering the module-level feature matrices; use distinct names.
    for i, (X_tr, X_va) in enumerate(zip(X_train_list, X_validate_list)):
        model = ClosedForm().train(X_tr, Y_train)
        complexity_comparisons.append({
            'stem vector size': i + 1,
            'train mse': model.mse(X_tr, Y_train),
            'validate mse': model.mse(X_va, Y_validate),
        })
    with open(comparison_file, 'w') as f:
        json.dump(complexity_comparisons, f)
# Build the dataframe directly from the records instead of round-tripping
# through a JSON string (pd.read_json on a literal string is deprecated).
df = pd.DataFrame(complexity_comparisons)
With ~140 stop words removed:
# MSE vs. stem vector size (features built with ~140 stop words removed).
complexity_df = df[['stem vector size', 'train mse', 'validate mse']]
complexity_df.set_index('stem vector size').iplot(
    mode='lines',
    xTitle='Model Complexity (Stem Vector Size)',
    yTitle='MSE',
    title='MSE Against Model Complexity',
)
Horrible.
With no stop words removed:
# Same MSE-vs-complexity chart, for features built with no stop words removed.
no_stopwords_df = df[['stem vector size', 'train mse', 'validate mse']]
no_stopwords_df.set_index('stem vector size').iplot(
    mode='lines',
    xTitle='Model Complexity (Stem Vector Size)',
    yTitle='MSE',
    title='MSE Against Model Complexity',
)
With stop words, the model performs worse than without e.g. final mse is 1.9 (with stop words) compared to 1.4 (without stop words).
With ~70 stop words removed:
# Same MSE-vs-complexity chart, for features built with ~70 stop words removed.
partial_stopwords_df = df[['stem vector size', 'train mse', 'validate mse']]
partial_stopwords_df.set_index('stem vector size').iplot(
    mode='lines',
    xTitle='Model Complexity (Stem Vector Size)',
    yTitle='MSE',
    title='MSE Against Model Complexity',
)
3 tries and MSE just gets worse. This feature's a bust.
For the final model, we'll keep it at 42 degrees of freedom with all ~140 stop words removed.
# Compare the top-160 model against the full model
# (top 160 words + length + stemming features).
cf_train = predictions_train[0]
cf_validate = predictions_validate[0]
full_table = go.Table(
    header=dict(values=['',
                        '<b>Top 160</b>',
                        '<b>Full</b> (top 160, length and stemming)']),
    cells=dict(values=[
        ['<b>Training</b>', '<b>Validation</b>'],
        [cf_train_160['mse'], cf_validate_160['mse']],
        [cf_train['mse'], cf_validate['mse']],
    ]))
layout = dict(title='Closed Form MSE')
data = [full_table]
iplot(dict(data=data, layout=layout))
# Load test-set predictions, list the models, and extend the MSE comparison
# with a test-set column.
predictions_test = json.load(open(predictions_path / 'predictions_test.json'))
for idx, prediction in enumerate(predictions_test):
    print('%d. %s' % (idx, prediction['name']))
cf_test_160 = predictions_test[1]
cf_test = predictions_test[0]
test_table = go.Table(
    header=dict(values=['',
                        '<b>Top 160</b>',
                        '<b>Full</b> (top 160, length and stemming)']),
    cells=dict(values=[
        ['<b>Training</b>', '<b>Validation</b>', '<b>Test</b>'],
        [cf_train_160['mse'], cf_validate_160['mse'], cf_test_160['mse']],
        [cf_train['mse'], cf_validate['mse'], cf_test['mse']],
    ]))
layout = dict(title='Closed Form MSE')
data = [test_table]
iplot(dict(data=data, layout=layout))
# Closed-form predictions on the test set: full model vs top-160 model.
# NOTE(review): the 'Target' trace plots Y_train, but this chart covers the
# 1000-example TEST set — the target line is almost certainly wrong here.
# No test-target pickle is loaded anywhere in this notebook; confirm whether
# test targets exist and plot those instead.
cf_y_160 = cf_test_160['y_predicted']
cf_y = cf_test['y_predicted']

example_numbers = list(range(1, 1001))
target_trace = go.Scatter(
    x=example_numbers,
    y=[row[0] for row in Y_train.tolist()],
    name='Target',
    line={'color': '#F9A746', 'width': 1},
    connectgaps=True,
)
full_trace = go.Scatter(
    x=example_numbers,
    y=cf_y,
    name='Full',
    line={'color': '#E85285', 'width': 1},
    connectgaps=True,
)
top160_trace = go.Scatter(
    x=example_numbers,
    y=cf_y_160,
    name='Top 160',
    line={'color': '#6A1B9A', 'width': 1},
    connectgaps=True,
)
data = [target_trace, full_trace, top160_trace]
layout = {
    'title': 'Closed Form Predictions on Test Data',
    'xaxis': {'title': 'Example no.'},
    'yaxis': {'title': 'Popularity score'},
    'plot_bgcolor': '#151516',
}
fig = {'data': data, 'layout': layout}
iplot(fig)
The full model (2 new features) only outperforms the top-160 model on the training set, and is worse on both the validation and test sets.
Our best model overfits and I'm sad.